#
# impbaum.r
#
# format the raw data and run imputation model for replication of Baum and Lake (2003)
#
# 23/2/10, jH
#

# Remove every object from the global workspace before starting.
rm(list = ls())

library("Amelia")

# Log the installed Amelia package description (including its version).
print(packageDescription(pkg = "Amelia"))

# Raw merged data; the path is relative to the current working directory.
x <- read.csv(file = "baummerge2.csv")

# REMOVE MYSTERIOUS ZERO FROM THIS VARIABLE
# femlifeex values that are exactly 0 are recoded to missing.  which() returns
# only the positions where the comparison is TRUE, so rows that are already
# NA are left untouched.
zero_rows <- which(x$femlifeex == 0)
x$femlifeex[zero_rows] <- NA

# CREATE INTERACTIONS AND OTHER RESCALES
# -99 in polity2 is recoded to missing before the variable is collapsed.
x$polity2[x$polity2 %in% -99] <- NA

# Collapse polity2 into the number of thresholds it strictly exceeds:
#   p <= -11 -> 0,  -11 < p <= -6 -> 1,  -6 < p <= 0 -> 2,
#   0 < p <= 5 -> 3,  p > 5 -> 4.   NA stays NA.
# NOTE(review): any other negative codes at or below -11 would land in
# category 0 rather than NA -- confirm none are present in the data.
polity_cuts <- c(-11, -6, 0, 5)
x$polity2 <- Reduce(
  `+`,
  lapply(polity_cuts, function(threshold) x$polity2 > threshold)
)

# Indicator for gdppc above 2500, and its product with the collapsed polity2.
x$temprich <- x$gdppc > 2500
x$temprichdem <- x$polity2 * x$temprich

###########################
# SUBSET COUNTRIES  (THESE ARE COUNTRIES TO KEEP TO CREATE A SMALL TEST SUBSET)
#subcty<-c("USA","CAN","DOM","JAM","CUB","MEX","SLV","CRI","PER","BRA","CHL","GBR","FRA","CHE","ITA","SWE","NOR","CIV","CMR","CAF","TZA","SAU","OMN","MYS","SGP","AUS","NZL")
#
#subdata<-x[x$countrycode==subcty[1],]
#for(i in 2:length(subcty)){
#  temp<-x[x$countrycode==subcty[i],]
#  subdata<-rbind(subdata,temp)
#}
#
#subvars<-c("countrycode","countryname","year","femlifeex","femschool","gdpgrowth","gdppc","poptot","polity2","temprich","temprichdem")
#
#subdata<-subdata[ ,subvars]
#
#print(names(subdata))
#print(subdata[1:3,])
############################

# SUBSET COUNTRIES  (THESE ARE COUNTRIES TO REMOVE TO CREATE A LARGE SUBSET, REMOVING COUNTRIES FOR WHICH THERE IS NO DATA)
# negcty lists (country code, cutoff year) pairs.  For each pair, every row of
# that country with year >= cutoff is dropped.
negcty<-c("DMA",1977,"GRD",1977,"LCA",1980,"VCT",1977,"ARG",1977,"KNA",1977,"CZE",1980,"SVK",1980,"GEO",1979,"KGZ",1980,"UZB",1980,"KAZ",1980,"COM",1980,"KIR",1970)
negctymat <- matrix(negcty, ncol = 2, byrow = TRUE)

# c() coerces the years to character because they share a vector with the
# codes.  Comparing x$year against a character value is a string comparison,
# which only agrees with numeric ordering while every year has four digits.
# Convert the cutoffs back to numbers so the comparison is numeric.
negcodes <- negctymat[, 1]
negyears <- as.numeric(negctymat[, 2])

# Start from rows with a positive year, then knock out each listed
# country-year range in turn.
# NOTE(review): a row with NA in year or countrycode yields NA in the mask and
# therefore an all-NA row in negdata -- presumably the merged file has none;
# confirm.
flag <- x$year > 0
for (i in seq_along(negcodes)) {
  tempflag <- x$countrycode == negcodes[i] & x$year >= negyears[i]
  flag <- flag & !tempflag
  # Log the country code and how many rows this rule matched.
  print(c(negcodes[i], sum(tempflag)))
}

negdata <- x[flag, ]

# Columns kept in negdata; everything else is dropped.
subvars <- c(
  "countrycode", "countryname", "year",
  "femlifeex", "femschool", "gdpgrowth", "gdppc", "poptot",
  "polity2", "temprich", "temprichdem"
)
negdata <- negdata[subvars]

# Log the retained column names and the first three rows as a sanity check.
print(colnames(negdata))
print(negdata[seq_len(3), ])

# Column passed to amelia() as idvars.
baumidvars <- "countryname"

# Build a random output name: "hh" followed by a rounded uniform draw between
# 1111 and 9999.  This is useful in a parallel environment, such as "Condor",
# where all outputs will be dumped in one directory.  If you are from the
# future, where machines are faster, use instead name<-"h", and increase m to
# 100 in amelia() options.
b <- round(runif(n = 1, min = 1111, max = 9999))
name <- paste0("hh", b)
print(c("NAME OF OUTFILE", name))

# IMPUTATION MODEL FOR FEMLIFEEX (femlifeex ~ sqrt(femschool))
# Earlier specification, run on the small test subset, kept for reference:
#output<-amelia(subdata,cs="countrycode",tolerance=0.0001,ts="year",m=1,intercs=TRUE,polytime=3,logs=c("gdppc","poptot"),sqrts="femschool",idvars=baumidvars,outname=name,empri=5,p2s=2,incheck=FALSE)

# Run a single imputation (m = 1) on the large subset.  The panel is indexed
# by countrycode (cs) and year (ts), with polytime = 2 and intercs = TRUE.
# NOTE(review): outname was an argument of early Amelia II releases; newer
# releases may ignore it and expect write.amelia() instead -- confirm against
# the package version printed at the top of the log.
output <- amelia(
  negdata,
  m = 1,
  cs = "countrycode",
  ts = "year",
  polytime = 2,
  intercs = TRUE,
  idvars = baumidvars,
  logs = c("gdppc", "poptot"),
  sqrts = "femschool",
  ords = "polity2",
  tolerance = 0.0005,
  emburn = c(0, 100),
  autopri = FALSE,
  incheck = FALSE,
  p2s = 2,
  outname = name
)

print("finished")

